import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# statistics
from statsmodels.distributions.empirical_distribution import ECDF
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
import plotly.plotly as py
from plotly import tools
from datetime import date
import seaborn as sns
import random
import warnings
warnings.filterwarnings("ignore")
init_notebook_mode(connected=True)
import zipfile
import os
%matplotlib.inline
#!gsutil cp gs://powerful-bucket1/notebooks/Data/Customer_Demographics.xlsx /home/kanand_prime18
#!gsutil cp gs://powerful-bucket1/notebooks/Data/Customer_Transaction.xlsx /home/kanand_prime18
#!gsutil cp gs://powerful-bucket1/notebooks/Data/Store_Master.xlsx /home/kanand_prime18
#!gsutil cp gs://powerful-bucket1/notebooks/Data/Test_Set.xlsx /home/kanand_prime18
df1 = pd.read_excel("/home/kanand_prime18/Customer_Demographics.xlsx")
df2 = pd.read_excel("/home/kanand_prime18/Customer_Transaction.xlsx")
test = pd.read_excel("/home/kanand_prime18/Test_Set.xlsx")
df1.head()
df2.head()
df = pd.read_excel("/home/kanand_prime18/Store_Master.xlsx")
df.head(2)
df.shape
test.head(2)
test.shape
df1.shape
df2.shape
[i.shape for i in [df1, df2, df, test]]
"""Helper Functions to do Plotting"""
def generateLayoutBar(col_name):
"""
Generate a layout object for bar chart
"""
layout_bar = go.Layout(
autosize=False, # auto size the graph? use False if you are specifying the height and width
width=800, # height of the figure in pixels
height=600, # height of the figure in pixels
title="Distribution of {} column".format(col_name), # title of the figure
# more granular control on the title font
titlefont=dict(
family='Courier New, monospace', # font family
size=14, # size of the font
color='black' # color of the font
),
# granular control on the axes objects
xaxis=dict(
tickfont=dict(
family='Courier New, monospace', # font family
size=14, # size of ticks displayed on the x axis
color='black' # color of the font
)
),
yaxis=dict(
# range=[0,100],
title='Percentage',
titlefont=dict(
size=14,
color='black'
),
tickfont=dict(
family='Courier New, monospace', # font family
size=14, # size of ticks displayed on the y axis
color='black' # color of the font
)
),
font=dict(
family='Courier New, monospace', # font family
color="white", # color of the font
size=12 # size of the font displayed on the bar
)
)
return layout_bar
def plotBar(dataframe_name, col_name, top_n=None):
"""
Plot a bar chart for the categorical columns
Arguments:
dataframe name
categorical column name
Output:
Plot
"""
# create a table with value counts
temp = dataframe_name[col_name].value_counts()
if top_n is not None:
temp = temp.head(top_n)
# creating a Bar chart object of plotly
data = [go.Bar(
x=temp.index.astype(str), # x axis values
y=np.round(temp.values.astype(float) / temp.values.sum(), 4) * 100, # y axis values
text=['{}%'.format(i) for i in np.round(temp.values.astype(float) / temp.values.sum(), 4) * 100],
# text to be displayed on the bar, we are doing this to display the '%' symbol along with the number on the bar
textposition='auto', # specify at which position on the bar the text should appear
marker=dict(color='#0047AB'),)] # change color of the bar
# color used here Cobalt Blue
layout_bar = generateLayoutBar(col_name=col_name)
fig = go.Figure(data=data, layout=layout_bar)
return iplot(fig)
df1.shape
df1.describe()
df1.dtypes
from datetime import datetime
datetime.strptime('Jun 1 2005 1:33PM', '%b %d %Y %I:%M%p')
df1.First_txn_dt[0]
t = datetime.strptime('02NOV2010:00:00:00', '%d%b%Y:%H:%M:%S')
date_cols_demo = [ i for i in df1.columns if (i.endswith('dt')) or ('date' in i)]
date_cols_demo
df1.First_txn_dt.dtype
df1[date_cols_demo] = df1[date_cols_demo].astype('O')
def convertToDate(x):
try:
return datetime.strptime(x, '%d%b%Y:%H:%M:%S')
except:
return x
df1[date_cols_demo] = df1[date_cols_demo].applymap(lambda x : convertToDate(x))
date_cols_demo
def extractColTypes(dataset):
"""This functions extracts numeric, categorical , datetime and boolean column types.
Returns 4 lists with respective column types"""
num_cols_list = [i for i in dataset.columns if dataset[i].dtype in ['int64','float64']]
cat_cols_list = [i for i in dataset.columns if dataset[i].dtype in ['object']]
date_cols_list = [i for i in dataset.columns if dataset[i].dtype in ['datetime64[ns]']]
bool_cols_list = [i for i in dataset.columns if dataset[i].dtype in ['bool']]
print ("Numeric Columns:", len(num_cols_list))
print ("Categorical/Character Columns:", len(cat_cols_list))
print ("Date Columns:",len(date_cols_list))
print ("Boolean Columns:",len(bool_cols_list))
return(num_cols_list,cat_cols_list,date_cols_list,bool_cols_list)
demo_num_cols_list,demo_cat_cols_list,demo_date_cols_list,demo_bool_cols_list = extractColTypes(df1)
demo_cat_cols_list
today = convertToDate('24NOV2018:00:00:00')
def getDaysDiff(x):
t = -1*(x - today)
return t.apply(lambda x : x.days)
df1['custSince'] = getDaysDiff(df1.First_txn_dt)
df1['ds_accr']= getDaysDiff(df1.Last_accr_txn_dt)
df1['ds_rdm']= getDaysDiff(df1.Last_rdm_txn_dt)
for i in demo_cat_cols_list[1:]:
plotBar(df1, i,top_n=10)
Some columns here like the Income and Marital status are unspecified, as of now we cannot decide if they will help in the prediction, but we should keep a close watch for these.
demo_num_cols_list
def plotHist(dataframe, col_name):
data = [go.Histogram(x=dataframe[col_name],
marker=dict(
color='#CC0E1D',# Lava (#CC0E1D)
# color = 'rgb(200,0,0)' # you can provide color in HEX format or rgb format, genrally programmers prefer HEX format as it is a single string value and easy to pass as a variable
))]
layout = go.Layout(title = "Histogram of {}".format(col_name))
fig = go.Figure(data= data, layout=layout)
return iplot(fig)
plotHist(df1, 'Age')
Looks like the age group of 30-40 are frequent visitors of the store, also we see some entries above 100, I will remove them
plotHist(df1, 'Points')
Most people have 0-2000 points
df1[df1.Points>2000].shape
Just above 1000 customers have more than 2000 points
Let us see if these customers are distinguishable from the rest
for i in demo_cat_cols_list[1:]:
plotBar(df1[df1.Points>2000], i,top_n=10)
The distribution of all the above columns look very similar to those of the unfiltered data. There is no disctinguishing factor as such.
def plotMultiBox(dataframe,col_name, num_col_name):
data = []
for i in dataframe[col_name].unique():
trace = go.Box(y = dataframe[num_col_name][dataframe[col_name] == i],
name = i)
data.append(trace)
layout = go.Layout(title="Boxplot of levels in {} for {} column".format(col_name,num_col_name))
fig = go.Figure(data=data, layout=layout)
return (iplot(fig))
plotMultiBox(df1, 'Loyalty_Status', 'custSince')
We see that in general, gold customers are with the company longer
plotMultiBox(df1, 'Gender', 'custSince')
Females have been customers for a marginally more number of days than males.
plotMultiBox(df1, 'Income_Range', 'custSince')
df2.columns
df2.dtypes
df2.head(20)
plotHist(df2, 'Revenue')
df2.Customer_ID = df2.Customer_ID.astype('O')
tran_num_cols_list, tran_cat_cols_list, tran_date_cols_list, tran_bool_cols_list = extractColTypes(df2)
tran_cat_cols_list.remove('Customer_ID')
for i in tran_cat_cols_list:
plotBar(df2, i,top_n=10)
df1[['Last_accr_txn_dt', 'Last_rdm_txn_dt']].isnull().sum()
df1['recent_tran_date'] = df1[['Last_accr_txn_dt', 'Last_rdm_txn_dt']].apply(max, axis = 1)
df1[['Last_accr_txn_dt', 'Last_rdm_txn_dt','recent_tran_date']].head()
cust_recent_tran_date = df1[['Customer_ID','recent_tran_date']]
cust_recent_tran_date.isnull().sum()
# same functions used before
def getRecency(x):
t = -1*(x - today)
return t.apply(lambda x : x.days)
custId_Recency = pd.DataFrame(cust_recent_tran_date['Customer_ID'])
custId_Recency['Recency'] = getDaysDiff(cust_recent_tran_date.recent_tran_date)
custId_Recency.shape
custId_Recency.head()
df2.columns
df2.Transaction_Type.unique()
df2[df2.Transaction_Type != 'Return'].shape
df2[df2.Transaction_Type == 'Return'].Invoices = -1 * df2[df2.Transaction_Type == 'Return'].Invoices
custId_Frequency = df2.groupby(['Customer_ID']).agg({'Invoices': np.sum}).reset_index()
custId_Frequency.columns = ['Customer_ID', 'Frequency']
custId_Frequency.shape
custId_Frequency.head()
custId_Frequency.Frequency[custId_Frequency.Frequency<0]
custId_Monetary = df2.groupby(['Customer_ID']).agg({'Revenue': np.sum}).reset_index()
custId_Monetary.columns = ['Customer_ID','MonetaryValue']
custId_Monetary.shape
custId_Monetary.head()
custId_RFM = custId_Recency.join(custId_Frequency.set_index('Customer_ID'),on='Customer_ID')
custId_RFM = custId_RFM.join(custId_Monetary.set_index('Customer_ID'),on='Customer_ID',)
custId_RFM.head()
custId_RFM['RecencyRank'] = pd.qcut(custId_RFM.Recency,q = 5, labels = False)
custId_RFM['FrequencyRank'] = pd.qcut(custId_RFM.Frequency,q = 5, labels = False, duplicates = 'drop')
custId_RFM['MonetaryValueRank'] = pd.qcut(custId_RFM.MonetaryValue,q = 5, labels = False , duplicates = 'drop')
custId_RFM.shape
custId_RFM.head()
custId_RFM.isnull().sum()
custId_RFM.dropna(inplace=True)
custId_RFM.RecencyRank.unique()
for i in custId_RFM.columns:
if 'Rank' in i:
print (i)
custId_RFM[i] = custId_RFM[i].apply(lambda x : str(int(np.round(x)+1)))
custId_RFM.RecencyRank.unique()
custId_RFM.head()
custId_RFM['RFMScore'] = custId_RFM.RecencyRank + custId_RFM.FrequencyRank + custId_RFM.MonetaryValueRank
custId_RFM.head()
custId_RFM.dtypes
datapath = '/home/kanand_prime18/'
custId_RFM.to_csv(datapath + "RFM.csv",index = False)
custId_RFM.RFMScore.value_counts()
segments_dict = {'155':'Best_Customers',
'145':'Best_Customers',
'154':'Best_Customers',
'244':'Best_Customers',
'254':'Best_Customers',
'245':'Best_Customers',
'135': 'High_Spending_New_Customers',
'235': 'High_Spending_New_Customers',
'134': 'High_Spending_New_Customers',
'125': 'High_Spending_New_Customers',
'124': 'High_Spending_New_Customers',
'225': 'High_Spending_New_Customers',
'151': 'Lowest_Spending_Active_Loyal_Customers',
'152': 'Lowest_Spending_Active_Loyal_Customers',
'153': 'Lowest_Spending_Active_Loyal_Customers',
'141': 'Lowest_Spending_Active_Loyal_Customers',
'142': 'Lowest_Spending_Active_Loyal_Customers',
'143': 'Lowest_Spending_Active_Loyal_Customers',
'251': 'Lowest_Spending_Active_Loyal_Customers',
'252': 'Lowest_Spending_Active_Loyal_Customers',
'253': 'Lowest_Spending_Active_Loyal_Customers',
'515': 'Curned_Best_Customers',
'514': 'Curned_Best_Customers',
'525': 'Curned_Best_Customers',
'524': 'Curned_Best_Customers',
'415': 'Curned_Best_Customers',
'414': 'Curned_Best_Customers',
}
custId_RFM['CustomerSegment'] = custId_RFM.RFMScore
custId_RFM['CustomerSegment'] = custId_RFM['CustomerSegment'].replace(segments_dict)
## Best_Customers
Best_Customers = go.Scatter3d(
x=custId_RFM.Recency[custId_RFM['CustomerSegment']=='Best_Customers'],
y=custId_RFM.Frequency[custId_RFM['CustomerSegment']=='Best_Customers'],
z=custId_RFM.MonetaryValue[custId_RFM['CustomerSegment']=='Best_Customers'],
mode='markers',name ='Best_Customers',
marker=dict(
size=4,
opacity=0.6
)
)
## High_Spending_New_Customers
High_Spending_New_Customers = go.Scatter3d(
x=custId_RFM.Recency[custId_RFM['CustomerSegment']=='High_Spending_New_Customers'],
y=custId_RFM.Frequency[custId_RFM['CustomerSegment']=='High_Spending_New_Customers'],
z=custId_RFM.MonetaryValue[custId_RFM['CustomerSegment']=='High_Spending_New_Customers'],
mode='markers',name ='High_Spending_New_Customers',
marker=dict(
size=4,
opacity=0.6
)
)
## Lowest_Spending_Active_Loyal_Customers
Lowest_Spending_Active_Loyal_Customers = go.Scatter3d(
x=custId_RFM.Recency[custId_RFM['CustomerSegment']=='Lowest_Spending_Active_Loyal_Customers'],
y=custId_RFM.Frequency[custId_RFM['CustomerSegment']=='Lowest_Spending_Active_Loyal_Customers'],
z=custId_RFM.MonetaryValue[custId_RFM['CustomerSegment']=='Lowest_Spending_Active_Loyal_Customers'],
mode='markers',name ='Lowest_Spending_Active_Loyal_Customers',
marker=dict(
size=4,
opacity=0.6
)
)
## Curned_Best_Customers
Curned_Best_Customers = go.Scatter3d(
x=custId_RFM.Recency[custId_RFM['CustomerSegment']=='Curned_Best_Customers'],
y=custId_RFM.Frequency[custId_RFM['CustomerSegment']=='Curned_Best_Customers'],
z=custId_RFM.MonetaryValue[custId_RFM['CustomerSegment']=='Curned_Best_Customers'],
mode='markers',name ='Curned_Best_Customers',
marker=dict(
size=4,
opacity=0.6
)
)
data = [Best_Customers,High_Spending_New_Customers, Lowest_Spending_Active_Loyal_Customers ,Curned_Best_Customers]
layout = go.Layout(
scene = dict(
xaxis = dict(
title='Recency',
backgroundcolor="black",
showbackground=True,
titlefont=dict(
size=16,
color='black'
)
),
yaxis = dict(
title='Frequency',
showbackground=True,
backgroundcolor="black",
titlefont=dict(
size=16,
color='black'
)
),
zaxis = dict(
title='MonetaryValue',
backgroundcolor="black",
showbackground=True,
titlefont=dict(
size=16,
color='black'
)
)
),
width=1000, # height of the figure in pixels
height=800, # height of the figure in pixels
margin = dict( b =15),)
fig = go.Figure(data=data, layout=layout)
fig['layout'].update(title= "RFM Customer Segmentation")
iplot(fig)